Import and check data

features = read_csv('./feature_extracted_total177.csv')
clinical = read_csv('./outcome_2020_total(1).csv') %>%
  mutate(Name = features$name)
#skimr::skim(clinical)
#skimr::skim(features) 0 missing values

pre filter features by corr

#var_0 = features %>%
#  filter(pcr == 0)
#var_1 = features %>%
#  filter(pcr != 0)
#var = colnames(features)[-c(1:2)] # may change index
#
#sig_vec = vector()
#for (variable in var) {
#  #print(variable)
#  test.sig= t.test(pull(var_0,variable),pull(var_1,variable))$p.value
#  sig_vec = append(sig_vec, test.sig)
#}
#
#sig.total = tibble(
#  var = var,
#  sig = sig_vec
#) %>%
#  filter(sig <0.1)
#
#features_into_account = sig.total$var
#length(features_into_account) #94


#t = scale(features[,3:1220])
# filter the features based on correlation
library(mlbench)
library(ranger)
# calculate correlation matrix
correlationMatrix <- cor(scale(features[,3:1220]))
# summarize the correlation matrix
#print(correlationMatrix)
# find attributes that are highly corrected (ideally >0.75)
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff = 0.55)
# print indexes of highly correlated attributes
#print(highlyCorrelated)
length(highlyCorrelated)
## [1] 1164

correlation matrix

library(ggcorrplot)
 # correlation for the all rad-features
#pheatmap::pheatmap(cor.all,cluster_rows = F,cluster_cols = F,show_rownames = F,show_colnames = #F,main = "Corrlation Matrix for All Extracted Radiomic Features") #随缘画的
#heatmap(correlationMatrix,labRow = F,labCol = F,main = "Corrlation Matrix for All Extracted #Radiomic Features", xlab = "The 1 ~ 1219 Originally Extracted Features",ylab = "The 1 ~ 1219 #Originally Extracted Features",dendrogram='none') 
cor.lowcorr = cor(features[,-c(1,2,highlyCorrelated)])
colnames(cor.lowcorr) = as.character(seq(1:length(colnames(cor.lowcorr))))
rownames(cor.lowcorr) = as.character(seq(1:length(colnames(cor.lowcorr))))

ggcorrplot(cor.lowcorr,
           #type = "lower",
           outline.color = "white",
           colors = c("#6D9EC1", "white", "#E46726")) +
  labs(x = "Features ID",
       title = "Correlation Matrix for the 47 Selected Radiomics Features at the First Stage",
       y = "Features ID") +
  theme(plot.title = element_text(hjust=.5))# selected features

LASSO

library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
#features.lasso = features %>% .[features_into_account] #datase for lasso
#x = features.lasso %>% as.matrix() %>% scale()
#y = features$pcr %>% as.factor()

features.lasso = features[,-highlyCorrelated] %>%
  .[,-1] %>%
  mutate(pcr = features$pcr) %>% 
  select(pcr,everything())

x = features.lasso %>%  .[,-1] %>% as.matrix() %>% scale()
y = features$pcr %>% as.factor()

levels(y) = c("N","Y")

ctrl1  = trainControl(method = "cv", number = 10,
                      summaryFunction = prSummary,
                       # prSummary needs calculated class probs
                       classProbs = T)

set.seed(888)
lasso.fit = train(x, y,method = "glmnet",
                  tuneGrid = expand.grid(alpha = 1,lambda =exp(seq(-10, -5, length=1000))),
        #preProc = c("center", "scale"),
        metric = "ROC",
        trControl = ctrl1)
lasso.fit$bestTune
##     alpha       lambda
## 151     1 9.618384e-05
#lasso.fit$finalModel$beta
ggplot(lasso.fit,highlight = T) +
  theme_bw() +
  labs(title = "Variable Selection Process via Logistic LASSO Regression") +
  theme(plot.title = element_text(hjust = .5))

#trace plots for lasso variable selection
library(glmnet)

plot.glmnet(lasso.fit$finalModel, xvar = "lambda", label = T,
     main = "Trace Plot for the Coefficients")

lasso.fit$bestTune
##     alpha       lambda
## 151     1 9.618384e-05

calculate radscores

coef = data.frame(as.matrix(coef(lasso.fit$finalModel,lasso.fit$bestTune$lambda)))

coef = coef %>%
  mutate(coef = rownames(coef)) %>%
  rename("value" = "X1") %>%
  filter(value !=0)

coef %>%
  filter(value !=0) %>%
  nrow() # number of useful predictors
## [1] 55
# The selected predictors are:
predictors = coef$coef[-1] #plus intercept 1+12
# the next step is to calculate radscores:
predictors_val = coef$value[-1]

# the next step is to calculate radscores:
feature.matrix = features[predictors] %>% as.matrix() %>% scale()
coef.matrix = predictors_val %>% as.matrix()
radscore = feature.matrix %*% coef.matrix + coef$value[1]

# construct a new dataset for future prediction
data.pred = clinical %>%
  mutate(radscore = as.vector(radscore),
         PCR = factor(PCR))

write.csv(data.pred,"./CTscore177.csv")

check radscore

data.pred %>%
  mutate(id = 1:nrow(data.pred)) %>%
  ggplot(aes(x = PCR,
             y = radscore, fill = PCR)) +
  geom_boxplot() +
  theme_bw()

data.pred %>%
  arrange(PCR) %>%
  mutate(id = 1:nrow(data.pred)) %>%
  ggplot(aes(x = reorder(id,radscore), y = radscore,
              fill = PCR)) +
  geom_col() +
  theme_classic() +
  labs(title = "Radscores for the 177 Patients",
       x = "Patients",
       y = "Radscore") + 
  theme(plot.title = element_text(hjust = .5),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  ggsci::scale_fill_jama(labels = c("PCR = 0", "PCR = 1"))

organizing data

#rf.score = read_csv("../rf.score_simple.csv") %>% as.matrix() %>% .[,-1]
data.pred = data.pred %>%
  mutate(Age = factor(Age),
         cT = ifelse(cT == 2, 3, cT),
         cT = factor(cT),
         cN = factor(cN),
         cTNM = factor(cTNM),
         MRF = as.character(MRF),
         `Tumor length` = factor(`Tumor length`),
         Distance = factor(Distance),
         CEA = factor(CEA),
         Differentiation = factor(Differentiation),
         Group = ifelse(Group == "C", "B", Group)
         #rf.score = rf.score
         ) # 有3个2,这样很可能导致无法交叉验证,先变成3看看
#2020.5.10 加入了rf.score
str(data.pred$PCR)
##  Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...

reg

set.seed(888)
ctrl = trainControl(method = "repeatedcv", number = 10, repeats = 5,
                    classProbs = TRUE,
                    summaryFunction = twoClassSummary)

levels(data.pred$PCR) = c("N", "Y")


rowTrain = createDataPartition(y = data.pred$PCR,
                               p = 2/3, # 划分训练验证
                               list = F)

rad.fit = train(x = data.pred[rowTrain,c(2:13,15)], # radscore and others
                y = data.pred$PCR[rowTrain],
                method = "glm",
                metric = "ROC",
                trControl = ctrl
                ) #radscore and others (logistic

x.lasso = model.matrix(PCR ~. ,data.pred[rowTrain,c(2:15)])
y.lasso = data.pred[rowTrain,c(2:15)]$PCR

lasso.predict.fit = train(x = x.lasso, # radscore with others (lasso)
                y = y.lasso,
                method = "glmnet",
                tuneGrid = expand.grid(alpha = 1,lambda = exp(seq(-5, 5, length=1000))),
                preProc = c("center", "scale"),
                trControl = ctrl)

rad_and_group.fit = train(x = data.pred[rowTrain,c(4,15)], # only radscore and group(logistic)
                y = data.pred$PCR[rowTrain],
                method = "glm",
                metric = "ROC",
                trControl = ctrl
                )

all_other.fit = train(x = data.pred[rowTrain,c(2:13)], # only others
                y = data.pred$PCR[rowTrain],
                method = "glm",
                metric = "ROC",
                trControl = ctrl
                )

test.pred.prob = predict(rad.fit, newdata = data.pred[-rowTrain,] , type = "prob")[,2]
test.pred = rep("N", length(test.pred.prob ))
test.pred[test.pred.prob > 0.5] = "Y"

caret::confusionMatrix(data = factor(test.pred,levels = c("N","Y")),
                       reference = data.pred$PCR[-rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 43  5
##          Y  5  5
##                                           
##                Accuracy : 0.8276          
##                  95% CI : (0.7057, 0.9141)
##     No Information Rate : 0.8276          
##     P-Value [Acc > NIR] : 0.5834          
##                                           
##                   Kappa : 0.3958          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.50000         
##             Specificity : 0.89583         
##          Pos Pred Value : 0.50000         
##          Neg Pred Value : 0.89583         
##              Prevalence : 0.17241         
##          Detection Rate : 0.08621         
##    Detection Prevalence : 0.17241         
##       Balanced Accuracy : 0.69792         
##                                           
##        'Positive' Class : Y               
## 

random forest

rf.grid = expand.grid(mtry = 1:6,
                       splitrule = "gini",
                       min.node.size = 20:30
                      )
set.seed(888)
rf.fit = train(PCR ~ ., data.pred[rowTrain,-1], 
                #subset = rowTrain,
                method = "ranger",
                tuneGrid = rf.grid,
                metric = "ROC",
                trControl = ctrl)

#rf.fit = train(PCR ~ ., data.pred[rowTrain,-1], 
#                #subset = rowTrain,
#                method = "rf",
#                ntree = 1000,
#                tuneGrid = data.frame(mtry = 1:6),
#                metric = "ROC",
#                trControl = ctrl)


rf.fit$bestTune
##    mtry splitrule min.node.size
## 56    6      gini            20
ggplot(rf.fit, highlight = TRUE) +
  theme_minimal() +
  labs(title = "Tuning Parameters for Random Forest") +
  theme(plot.title = element_text(hjust = .5)) +
  ggsci::scale_color_lancet()
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

set.seed(888)
data.pred.rf = data.pred
colnames(data.pred.rf) = make.names(colnames(data.pred.rf))
rf.final = ranger(PCR ~ ., 
                  data.pred.rf[rowTrain,-1], 
                  mtry = 6, 
                  splitrule = "gini",
                  min.node.size = 29,
                  importance = "impurity")

# var importance
vip::vip(rf.final) + theme_bw() + ggtitle("Variable Importance - RF")

barplot(sort(ranger::importance(rf.final), decreasing = FALSE),
        las = 2, 
        horiz = TRUE, 
        cex.names = 0.6,
        col = colorRampPalette(colors = c("black","gold"))(20))

# explain prediction
# fixed a small bug
library(lime)
## 
## Attaching package: 'lime'
## The following object is masked from 'package:dplyr':
## 
##     explain
new_obs = data.pred[-rowTrain,-c(1)][1:2,]
explainer.rf = lime(data.pred[rowTrain,-c(1)], rf.fit)
explanation.rf = lime::explain(new_obs, explainer.rf, n_features = 8, labels = "Y")
plot_features(explanation.rf)

plot_explanations(explanation.rf)

#attention!!! => actual result: case 1 is N, case 2 is Y

boosting

gbmB.grid = expand.grid(n.trees = c(3500,4000,4500),
                        interaction.depth = 1:4,
                        shrinkage = c(0.0008,0.001,0.0015),
                        n.minobsinnode = 10:20) # 2 originally
set.seed(888)
# Binomial loss function
gbmB.fit = train(PCR ~ ., data.pred[rowTrain,-1],
                 tuneGrid = gbmB.grid,
                 trControl = ctrl,
                 method = "gbm",
                 distribution = "bernoulli",
                 metric = "ROC",
                 verbose = FALSE)

ggplot(gbmB.fit, highlight = T) +
  theme_minimal() +
  labs(title = "Tuning Parameters for Gradient Boosting Model") +
  theme(plot.title = element_text(hjust = .5)) #+

  #ggsci::scale_color_lancet()
gbmB.fit$bestTune
##   n.trees interaction.depth shrinkage n.minobsinnode
## 2    4000                 1     8e-04             10
vip::vip(gbmB.fit$finalModel) + theme_bw() + ggtitle("Variable Importance - GBM")

summary(gbmB.fit$finalModel, las = 2, cBars = 19, cex.names = 0.6)

##                                 var     rel.inf
## radscore                   radscore 89.91369181
## `Tumor thickness` `Tumor thickness`  5.95875618
## GroupB                       GroupB  2.96606778
## Age1                           Age1  0.28694248
## CEA1                           CEA1  0.24241520
## Differentiation1   Differentiation1  0.18522160
## `Tumor length`1     `Tumor length`1  0.12323893
## Distance1                 Distance1  0.12047243
## MRF1                           MRF1  0.08252402
## SexMale                     SexMale  0.04490061
## cT4                             cT4  0.04110177
## cN1                             cN1  0.02046010
## cTNM3                         cTNM3  0.01420709
explainer.gbm = lime(data.pred[rowTrain,-1], gbmB.fit)
explanation.gbm = explain(new_obs, explainer.gbm, n_features = 8,labels = "Y")
plot_features(explanation.gbm)

Decision Tree

#data.pred.tree = data.pred
#colnames(data.pred.tree) = make.names(colnames(data.pred.tree))
#library(rpart.plot)
#tree.fit = train(PCR ~ ., data.pred.tree[rowTrain,-1],
#                 method = "rpart",
#                 metric = "ROC",
#                 trControl = ctrl,
#                 tuneGrid = data.frame(cp = exp(seq(-9,-1, len = 100))))
#
#ggplot(tree.fit,highlight = T)
#rpart.plot(tree.fit$finalModel)

resample

res = resamples(list(only_radscore = rad.fit,
                     rad_and_group = rad_and_group.fit,
                     all_other = all_other.fit,
                     rf = rf.fit,
                     gbmB = gbmB.fit),
                     metric = "ROC")
summary(res)
## 
## Call:
## summary.resamples(object = res)
## 
## Models: only_radscore, rad_and_group, all_other, rf, gbmB 
## Number of resamples: 50 
## 
## ROC 
##               Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## only_radscore 0.30 0.8888889 0.9500000 0.9093333    1.00    1    0
## rad_and_group 0.55 0.9111111 1.0000000 0.9322222    1.00    1    0
## all_other     0.00 0.4000000 0.5250000 0.5326667    0.65    1    0
## rf            0.55 0.9000000 0.9833333 0.9151111    1.00    1    0
## gbmB          0.55 0.8916667 0.9833333 0.9330370    1.00    1    0
## 
## Sens 
##                    Min. 1st Qu. Median      Mean 3rd Qu. Max. NA's
## only_radscore 0.7000000     0.9      1 0.9564444       1    1    0
## rad_and_group 0.7777778     1.0      1 0.9708889       1    1    0
## all_other     0.7777778     0.9      1 0.9551111       1    1    0
## rf            0.8000000     1.0      1 0.9815556       1    1    0
## gbmB          0.7000000     0.9      1 0.9326667       1    1    0
## 
## Spec 
##               Min. 1st Qu.    Median      Mean 3rd Qu.      Max. NA's
## only_radscore    0     0.5 0.5000000 0.5733333       1 1.0000000    0
## rad_and_group    0     0.5 1.0000000 0.7200000       1 1.0000000    0
## all_other        0     0.0 0.0000000 0.0600000       0 0.6666667    0
## rf               0     0.5 0.5000000 0.6166667       1 1.0000000    0
## gbmB             0     0.5 0.5833333 0.6933333       1 1.0000000    0
bwplot(res)

radscores by groups

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
## 
##     auc
## The following object is masked from 'package:ModelMetrics':
## 
##     auc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc = roc(data.pred$PCR,data.pred$radscore)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc,
     auc.polygon = TRUE,
     grid=c(0.1, 0.2), 
     grid.col = c("green", "red"), 
     max.auc.polygon = TRUE,
     auc.polygon.col = "skyblue", 
     print.thres=TRUE)

# thre_value = -1.270

data.tbl = data.pred %>%
  mutate(radscore_cat = ifelse(radscore >= -1.270, "High", "Low"))

table 1 grouped by radscores

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
label = rep("Primary Cohort",177)
label[-rowTrain] = "Validation Cohort"
data.tbl = data.tbl %>%
  mutate(cohort = label)

primary = data.tbl[rowTrain,]
validation = data.tbl[-rowTrain,]


control = arsenal::tableby.control(test = T,
                                    numeric.stats = c("meansd","medianq1q3","range"),
                                    #cat.stats = c("countrowpct"),
                                    ordered.stats = c("Nmiss"),
                                    digits = 2)

tbl_primary = arsenal::tableby(PCR~ Age + Sex + Group + cT + cN +cTNM + MRF + `Tumor length` + `Tumor thickness` + Distance + CEA  + Differentiation + radscore, data = primary,control = control)

tbl_validation = arsenal::tableby(PCR~ Age + Sex + Group + cT + cN +cTNM + MRF + `Tumor length` + `Tumor thickness` + Distance + CEA  + Differentiation + radscore, data = validation, control = control)

summary(tbl_primary,text = T) %>%
  knitr::kable(booktabs = T, caption = "Descriptive Statistics") %>%
  kable_styling(latex_options = c("striped", "hold_position"))
Descriptive Statistics
N (N=98) Y (N=21) Total (N=119) p value
Age 0.783
  • 0
67 (68.4%) 15 (71.4%) 82 (68.9%)
  • 1
31 (31.6%) 6 (28.6%) 37 (31.1%)
Sex 0.926
  • Female
29 (29.6%) 6 (28.6%) 35 (29.4%)
  • Male
69 (70.4%) 15 (71.4%) 84 (70.6%)
Group 0.040
  • A
31 (31.6%) 2 (9.5%) 33 (27.7%)
  • B
67 (68.4%) 19 (90.5%) 86 (72.3%)
cT 0.492
  • 3
85 (86.7%) 17 (81.0%) 102 (85.7%)
  • 4
13 (13.3%) 4 (19.0%) 17 (14.3%)
cN 0.594
  • 0
24 (24.5%) 4 (19.0%) 28 (23.5%)
  • 1
74 (75.5%) 17 (81.0%) 91 (76.5%)
cTNM 0.531
  • 2
25 (25.5%) 4 (19.0%) 29 (24.4%)
  • 3
73 (74.5%) 17 (81.0%) 90 (75.6%)
MRF 0.735
  • 0
69 (70.4%) 14 (66.7%) 83 (69.7%)
  • 1
29 (29.6%) 7 (33.3%) 36 (30.3%)
Tumor length 0.478
  • 0
21 (21.4%) 6 (28.6%) 27 (22.7%)
  • 1
77 (78.6%) 15 (71.4%) 92 (77.3%)
Tumor thickness 0.054
  • Mean (SD)
15.70 (6.53) 19.00 (9.20) 16.28 (7.14)
  • Median (Q1, Q3)
15.00 (11.25, 18.00) 15.00 (13.00, 22.00) 15.00 (12.00, 19.00)
  • Range
8.00 - 51.00 12.00 - 47.00 8.00 - 51.00
Distance 0.401
  • 0
37 (37.8%) 10 (47.6%) 47 (39.5%)
  • 1
61 (62.2%) 11 (52.4%) 72 (60.5%)
CEA 0.567
  • 0
67 (68.4%) 13 (61.9%) 80 (67.2%)
  • 1
31 (31.6%) 8 (38.1%) 39 (32.8%)
Differentiation 0.226
  • 0
42 (42.9%) 6 (28.6%) 48 (40.3%)
  • 1
56 (57.1%) 15 (71.4%) 71 (59.7%)
radscore < 0.001
  • Mean (SD)
-4.33 (4.34) 0.51 (2.37) -3.47 (4.45)
  • Median (Q1, Q3)
-3.39 (-5.35, -2.23) 0.67 (-0.71, 1.82) -2.94 (-5.03, -1.34)
  • Range
-37.69 - 0.07 -6.15 - 5.38 -37.69 - 5.38
summary(tbl_validation,text = T) %>%
  knitr::kable(booktabs = T, caption = "Descriptive Statistics") %>%
  kable_styling(latex_options = c("striped", "hold_position"))
Descriptive Statistics
N (N=48) Y (N=10) Total (N=58) p value
Age 0.318
  • 0
32 (66.7%) 5 (50.0%) 37 (63.8%)
  • 1
16 (33.3%) 5 (50.0%) 21 (36.2%)
Sex 0.882
  • Female
18 (37.5%) 4 (40.0%) 22 (37.9%)
  • Male
30 (62.5%) 6 (60.0%) 36 (62.1%)
Group 0.414
  • A
13 (27.1%) 4 (40.0%) 17 (29.3%)
  • B
35 (72.9%) 6 (60.0%) 41 (70.7%)
cT 0.841
  • 3
37 (77.1%) 8 (80.0%) 45 (77.6%)
  • 4
11 (22.9%) 2 (20.0%) 13 (22.4%)
cN 0.113
  • 0
10 (20.8%) 0 (0.0%) 10 (17.2%)
  • 1
38 (79.2%) 10 (100.0%) 48 (82.8%)
cTNM 0.359
  • 2
11 (22.9%) 1 (10.0%) 12 (20.7%)
  • 3
37 (77.1%) 9 (90.0%) 46 (79.3%)
MRF 0.743
  • 0
36 (75.0%) 7 (70.0%) 43 (74.1%)
  • 1
12 (25.0%) 3 (30.0%) 15 (25.9%)
Tumor length 0.414
  • 0
13 (27.1%) 4 (40.0%) 17 (29.3%)
  • 1
35 (72.9%) 6 (60.0%) 41 (70.7%)
Tumor thickness 0.417
  • Mean (SD)
15.69 (5.41) 14.20 (4.16) 15.43 (5.21)
  • Median (Q1, Q3)
15.00 (12.75, 18.00) 13.50 (11.25, 16.00) 15.00 (12.00, 17.75)
  • Range
7.50 - 35.00 9.00 - 22.00 7.50 - 35.00
Distance 0.462
  • 0
18 (37.5%) 5 (50.0%) 23 (39.7%)
  • 1
30 (62.5%) 5 (50.0%) 35 (60.3%)
CEA 0.653
  • 0
30 (62.5%) 7 (70.0%) 37 (63.8%)
  • 1
18 (37.5%) 3 (30.0%) 21 (36.2%)
Differentiation 0.810
  • 0
22 (45.8%) 5 (50.0%) 27 (46.6%)
  • 1
26 (54.2%) 5 (50.0%) 31 (53.4%)
radscore < 0.001
  • Mean (SD)
-4.22 (3.22) 0.38 (1.36) -3.43 (3.46)
  • Median (Q1, Q3)
-3.62 (-5.96, -2.29) 0.09 (-0.58, 1.18) -3.04 (-5.09, -1.17)
  • Range
-16.75 - 1.59 -1.22 - 2.85 -16.75 - 2.85

Making ROC curves for logistic regression Models

for training set

# rad.fit ROC (radscore and others)
rad.fit.pred.train.prob = predict(rad.fit,newdata = data.pred[rowTrain,],type = "prob")[,2]# select pos resp
rad.fit.pred.train = rep("N",length(rad.fit.pred.train.prob))
rad.fit.pred.train[rad.fit.pred.train.prob > 0.5] = "Y"

roc.rad.fit = roc(data.pred$PCR[rowTrain], rad.fit.pred.train.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rad.fit,
     legacy.axes = T,
     print.auc = T,
     print.thres = T)

ci.auc(roc.rad.fit)
## 95% CI: 0.8945-1 (DeLong)
#plot(smooth(rad_and_group.fit.fit),
     #add = T,
     #col = 4)
# only radscore and groups
rad_group.fit.pred.train.prob = predict(rad_and_group.fit ,newdata = data.pred[rowTrain,],type = "prob")[,2]# select pos resp
rad_group.fit.pred.train = rep("N",length(rad_group.fit.pred.train.prob))
rad_group.fit.pred.train[rad_group.fit.pred.train.prob > 0.5] = "Y"

roc.rad_group.fit = roc(data.pred$PCR[rowTrain], rad_group.fit.pred.train.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rad_group.fit,
     legacy.axes = T,
     print.auc = T,
     print.thres = T)

ci.auc(roc.rad_group.fit)
## 95% CI: 0.8565-1 (DeLong)
# RF train ROC
rf.fit.pred.train.prob = predict(rf.fit,newdata = data.pred[rowTrain,-1],type = "prob")[,2]# select pos resp
rf.fit.pred.train = rep("N",length(rf.fit.pred.train.prob))
rf.fit.pred.train[rf.fit.pred.train.prob > 0.5] = "Y"

roc.rf.fit = roc(data.pred$PCR[rowTrain], rf.fit.pred.train.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rf.fit,
     legacy.axes = T,
     print.auc = T,
     print.thres = T)

ci.auc(roc.rf.fit)
## 95% CI: 0.9772-1 (DeLong)
# Boosting train ROC
gbmB.fit.pred.train.prob = predict(gbmB.fit, newdata = data.pred[rowTrain,-1], type = "prob")[,2]# select pos resp
gbmB.fit.pred.train = rep("N",length(gbmB.fit.pred.train.prob))
gbmB.fit.pred.train[gbmB.fit.pred.train.prob > 0.5] = "Y"

roc.gbmB.fit = roc(data.pred$PCR[rowTrain], gbmB.fit.pred.train.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.gbmB.fit,
     legacy.axes = T,
     print.auc = T)

ci.auc(roc.gbmB.fit)
## 95% CI: 0.921-0.9973 (DeLong)
plot(roc.rad.fit, print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.1,auc.polygon.col="gray", grid = c(0.5, 0.3), max.auc.polygon=TRUE,legen = T,main = "Primary Cohort")
plot.roc(roc.rf.fit,add=T,col="red", print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.2)
plot.roc(roc.gbmB.fit,add=T,col="blue",print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.3,)
plot.roc(roc.rad_group.fit,add=T,col="yellow",print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.4)
legend("bottomright",legend = c("radscore with other clinical variables",
                  "radscore with `group`",
                  "random forest",
                  "gradient boosting model",
                  "SVM"),
       col = c("black","yellow","red","blue"),lwd = 4,cex = 0.35)

for validation set

# rad.fit ROC (radscore and others)
rad.fit.pred.test.prob = predict(rad.fit,newdata = data.pred[-rowTrain,],type = "prob")[,2]# select pos resp
rad.fit.pred.test = rep("N",length(rad.fit.pred.test.prob))
rad.fit.pred.test[rad.fit.pred.test.prob > 0.5] = "Y"

roc.rad.fit.test = roc(data.pred$PCR[-rowTrain], rad.fit.pred.test.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rad.fit.test,
     legacy.axes = T,
     print.auc = T,
     print.thres = T)

ci.auc(roc.rad.fit.test)
## 95% CI: 0.6934-1 (DeLong)
#plot(smooth(rad_and_group.fit.fit),
     #add = T,
     #col = 4)

# only radscore and groups
rad_group.fit.pred.test.prob = predict(rad_and_group.fit ,newdata = data.pred[-rowTrain,],type = "prob")[,2]# select pos resp
rad_group.fit.pred.test = rep("N",length(rad_group.fit.pred.test.prob))
rad_group.fit.pred.test[rad_group.fit.pred.test.prob > 0.5] = "Y"

roc.rad_group.fit.test = roc(data.pred$PCR[-rowTrain], rad_group.fit.pred.test.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rad_group.fit.test,
     legacy.axes = T,
     print.auc = T,
     print.thres = T)

ci.auc(roc.rad_group.fit.test)
## 95% CI: 0.9051-1 (DeLong)
# RF train ROC
rf.fit.pred.test.prob = predict(rf.fit,newdata = data.pred[-rowTrain,-1],type = "prob")[,2]# select pos resp
rf.fit.pred.test = rep("N",length(rf.fit.pred.test.prob))
rf.fit.pred.test[rf.fit.pred.test.prob > 0.5] = "Y"

roc.rf.fit.test = roc(data.pred$PCR[-rowTrain], rf.fit.pred.test.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rf.fit.test,
     legacy.axes = T,
     print.auc = T,
     print.thres = T)

ci.auc(roc.rf.fit.test)
## 95% CI: 0.9007-1 (DeLong)
# Boosting test ROC
gbmB.fit.pred.test.prob = predict(gbmB.fit,newdata = data.pred[-rowTrain,-1], type = "prob")[,2]# select pos resp
gbmB.fit.pred.test = rep("N",length(gbmB.fit.pred.test.prob))
gbmB.fit.pred.test[gbmB.fit.pred.test.prob > 0.5] = "Y"

roc.gbmB.fit.test = roc(data.pred$PCR[-rowTrain], gbmB.fit.pred.test.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.gbmB.fit.test,
     legacy.axes = T,
     print.auc = T)

ci.auc(roc.gbmB.fit.test)
## 95% CI: 0.9065-1 (DeLong)
plot(roc.rad.fit.test, print.auc=TRUE,print.auc.x = 0.3,print.auc.y= 0.1,grid = c(0.5, 0.3), max.auc.polygon=TRUE,legen = T,main = "Validation Cohort")
plot.roc(roc.rf.fit.test,add=T,col="red", print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.2)
plot.roc(roc.gbmB.fit.test,add=T,col="blue",print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.3,)
plot.roc(roc.rad_group.fit.test,add=T,col="yellow",print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.4)
legend("bottomright",legend = c("radscore with other clinical variables",
                  "radscore with `group`",
                  "random forest",
                  "gradient boosting model",
                  "SVM"),
       col = c("black","yellow","red","blue"),lwd = 4,cex = 0.35)

logsitic regression’s caliberation & HL test

#?calibration()
trainProbs = data.frame(
obs = data.pred$PCR[rowTrain],
rad = 1-rad.fit.pred.train.prob,
rf = 1-rf.fit.pred.train.prob,
rad_and_group = 1-rad_group.fit.pred.train.prob,
gbm = 1-gbmB.fit.pred.train.prob)

testProbs = data.frame(
  obs = data.pred$PCR[-rowTrain],
  rad = 1-rad.fit.pred.test.prob,
  rf = 1-rf.fit.pred.test.prob,
  rad_and_group = 1-rad_group.fit.pred.test.prob,
  gbm = 1-gbmB.fit.pred.test.prob
)

cal.int = calibration(obs ~ rad +
                        rf + 
                        rad_and_group + gbm,
                        data = trainProbs)
cal.ext = calibration(obs ~ rad + 
                        rf + rad_and_group +gbm 
                      , data = testProbs)

plot(cal.int,type = "l",auto.key = list(columns = 4,
                                          lines = TRUE,
                                          points = FALSE),
     main = "Calibration Curves for Primary Cohort")

plot(cal.ext,type = "l",auto.key = list(columns = 4,
                                          lines = TRUE,
                                          points = FALSE),
     main = "Calibration Curves for Validation Cohort")

results of the logsitic regression

# radscore and other variables
summary(rad.fit$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1176  -0.3160  -0.0614  -0.0039   3.6809  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -4.95194    2.54104  -1.949  0.05132 .  
## Age1                 0.81823    0.99107   0.826  0.40903    
## SexMale              0.01206    1.04495   0.012  0.99079    
## GroupB               3.98457    1.64768   2.418  0.01559 *  
## cT4                  0.85291    1.79913   0.474  0.63545    
## cN1                 -8.46698 2399.54616  -0.004  0.99718    
## cTNM3               10.13855 2399.54576   0.004  0.99663    
## MRF1                -1.17660    1.56353  -0.753  0.45173    
## `Tumor length`1     -1.79580    1.27591  -1.407  0.15929    
## `Tumor thickness`    0.12757    0.08174   1.561  0.11856    
## Distance1           -0.33524    0.99333  -0.337  0.73575    
## CEA1                 1.58655    1.04535   1.518  0.12909    
## Differentiation1    -0.98389    0.92295  -1.066  0.28642    
## radscore             1.27366    0.33449   3.808  0.00014 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 110.908  on 118  degrees of freedom
## Residual deviance:  43.837  on 105  degrees of freedom
## AIC: 71.837
## 
## Number of Fisher Scoring iterations: 15
# radscore and group
summary(rad_and_group.fit$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4319  -0.3875  -0.1122  -0.0136   3.5259  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.5485     0.9728  -1.592   0.1114    
## GroupB        2.0499     1.0634   1.928   0.0539 .  
## radscore      1.0916     0.2495   4.374 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 110.908  on 118  degrees of freedom
## Residual deviance:  53.267  on 116  degrees of freedom
## AIC: 59.267
## 
## Number of Fisher Scoring iterations: 7

Sen,Spec,PPV,NPV

# validation results
caret::confusionMatrix(factor(rad.fit.pred.test),reference = data.pred$PCR[-rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 43  5
##          Y  5  5
##                                           
##                Accuracy : 0.8276          
##                  95% CI : (0.7057, 0.9141)
##     No Information Rate : 0.8276          
##     P-Value [Acc > NIR] : 0.5834          
##                                           
##                   Kappa : 0.3958          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.50000         
##             Specificity : 0.89583         
##          Pos Pred Value : 0.50000         
##          Neg Pred Value : 0.89583         
##              Prevalence : 0.17241         
##          Detection Rate : 0.08621         
##    Detection Prevalence : 0.17241         
##       Balanced Accuracy : 0.69792         
##                                           
##        'Positive' Class : Y               
## 
caret::confusionMatrix(factor(rad_group.fit.pred.test),reference = data.pred$PCR[-rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 46  5
##          Y  2  5
##                                          
##                Accuracy : 0.8793         
##                  95% CI : (0.767, 0.9501)
##     No Information Rate : 0.8276         
##     P-Value [Acc > NIR] : 0.1949         
##                                          
##                   Kappa : 0.5201         
##                                          
##  Mcnemar's Test P-Value : 0.4497         
##                                          
##             Sensitivity : 0.50000        
##             Specificity : 0.95833        
##          Pos Pred Value : 0.71429        
##          Neg Pred Value : 0.90196        
##              Prevalence : 0.17241        
##          Detection Rate : 0.08621        
##    Detection Prevalence : 0.12069        
##       Balanced Accuracy : 0.72917        
##                                          
##        'Positive' Class : Y              
## 
caret::confusionMatrix(factor(rf.fit.pred.test),reference = data.pred$PCR[-rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 47  5
##          Y  1  5
##                                           
##                Accuracy : 0.8966          
##                  95% CI : (0.7883, 0.9611)
##     No Information Rate : 0.8276          
##     P-Value [Acc > NIR] : 0.1073          
##                                           
##                   Kappa : 0.5693          
##                                           
##  Mcnemar's Test P-Value : 0.2207          
##                                           
##             Sensitivity : 0.50000         
##             Specificity : 0.97917         
##          Pos Pred Value : 0.83333         
##          Neg Pred Value : 0.90385         
##              Prevalence : 0.17241         
##          Detection Rate : 0.08621         
##    Detection Prevalence : 0.10345         
##       Balanced Accuracy : 0.73958         
##                                           
##        'Positive' Class : Y               
## 
caret::confusionMatrix(factor(gbmB.fit.pred.test),reference = data.pred$PCR[-rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 46  4
##          Y  2  6
##                                           
##                Accuracy : 0.8966          
##                  95% CI : (0.7883, 0.9611)
##     No Information Rate : 0.8276          
##     P-Value [Acc > NIR] : 0.1073          
##                                           
##                   Kappa : 0.6063          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.6000          
##             Specificity : 0.9583          
##          Pos Pred Value : 0.7500          
##          Neg Pred Value : 0.9200          
##              Prevalence : 0.1724          
##          Detection Rate : 0.1034          
##    Detection Prevalence : 0.1379          
##       Balanced Accuracy : 0.7792          
##                                           
##        'Positive' Class : Y               
## 
#http://vassarstats.net/clin1.html#return
# primary results
caret::confusionMatrix(factor(rad.fit.pred.train),reference = data.pred$PCR[rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 95  5
##          Y  3 16
##                                           
##                Accuracy : 0.9328          
##                  95% CI : (0.8718, 0.9705)
##     No Information Rate : 0.8235          
##     P-Value [Acc > NIR] : 0.0004702       
##                                           
##                   Kappa : 0.7597          
##                                           
##  Mcnemar's Test P-Value : 0.7236736       
##                                           
##             Sensitivity : 0.7619          
##             Specificity : 0.9694          
##          Pos Pred Value : 0.8421          
##          Neg Pred Value : 0.9500          
##              Prevalence : 0.1765          
##          Detection Rate : 0.1345          
##    Detection Prevalence : 0.1597          
##       Balanced Accuracy : 0.8656          
##                                           
##        'Positive' Class : Y               
## 
caret::confusionMatrix(factor(rad_group.fit.pred.train),reference = data.pred$PCR[rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 96  6
##          Y  2 15
##                                           
##                Accuracy : 0.9328          
##                  95% CI : (0.8718, 0.9705)
##     No Information Rate : 0.8235          
##     P-Value [Acc > NIR] : 0.0004702       
##                                           
##                   Kappa : 0.75            
##                                           
##  Mcnemar's Test P-Value : 0.2888444       
##                                           
##             Sensitivity : 0.7143          
##             Specificity : 0.9796          
##          Pos Pred Value : 0.8824          
##          Neg Pred Value : 0.9412          
##              Prevalence : 0.1765          
##          Detection Rate : 0.1261          
##    Detection Prevalence : 0.1429          
##       Balanced Accuracy : 0.8469          
##                                           
##        'Positive' Class : Y               
## 
caret::confusionMatrix(factor(rf.fit.pred.train),reference = data.pred$PCR[rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 98  7
##          Y  0 14
##                                          
##                Accuracy : 0.9412         
##                  95% CI : (0.8826, 0.976)
##     No Information Rate : 0.8235         
##     P-Value [Acc > NIR] : 0.0001479      
##                                          
##                   Kappa : 0.7671         
##                                          
##  Mcnemar's Test P-Value : 0.0233422      
##                                          
##             Sensitivity : 0.6667         
##             Specificity : 1.0000         
##          Pos Pred Value : 1.0000         
##          Neg Pred Value : 0.9333         
##              Prevalence : 0.1765         
##          Detection Rate : 0.1176         
##    Detection Prevalence : 0.1176         
##       Balanced Accuracy : 0.8333         
##                                          
##        'Positive' Class : Y              
## 
caret::confusionMatrix(factor(gbmB.fit.pred.train),reference = data.pred$PCR[rowTrain],
                       positive = "Y")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  N  Y
##          N 96  6
##          Y  2 15
##                                           
##                Accuracy : 0.9328          
##                  95% CI : (0.8718, 0.9705)
##     No Information Rate : 0.8235          
##     P-Value [Acc > NIR] : 0.0004702       
##                                           
##                   Kappa : 0.75            
##                                           
##  Mcnemar's Test P-Value : 0.2888444       
##                                           
##             Sensitivity : 0.7143          
##             Specificity : 0.9796          
##          Pos Pred Value : 0.8824          
##          Neg Pred Value : 0.9412          
##              Prevalence : 0.1765          
##          Detection Rate : 0.1261          
##    Detection Prevalence : 0.1429          
##       Balanced Accuracy : 0.8469          
##                                           
##        'Positive' Class : Y               
## 

univariate logistic for table 2

var = names(clinical)[-c(1,14)]
for (i in c(2:13,ncol(data.pred))) {
  dat = data.pred[,c(i,14)]
  #print(names(dat)[1])
  res = glm(PCR ~ ., data = dat, family = binomial(link = "logit"))
  print("Point.est")
  print(exp(coef(res)[2]))
  print(exp(confint(res)[2,]))
  print("P.val")
  print(coef(summary(res))[2,4])
  }
## [1] "Point.est"
##     Age1 
## 1.158511 
##    2.5 %   97.5 % 
## 0.499636 2.577730 
## [1] "P.val"
## [1] 0.7229797
## [1] "Point.est"
##   SexMale 
## 0.9969697 
##     2.5 %    97.5 % 
## 0.4435814 2.3652537 
## [1] "P.val"
## [1] 0.9942765
## [1] "Point.est"
##   GroupB 
## 1.797386 
##     2.5 %    97.5 % 
## 0.7284238 5.1105784 
## [1] "P.val"
## [1] 0.2305823
## [1] "Point.est"
##  cT4 
## 1.22 
##     2.5 %    97.5 % 
## 0.4178221 3.1395735 
## [1] "P.val"
## [1] 0.6946095
## [1] "Point.est"
##      cN1 
## 2.049107 
##    2.5 %   97.5 % 
## 0.736501 7.289028 
## [1] "P.val"
## [1] 0.2084891
## [1] "Point.est"
##    cTNM3 
## 1.701818 
##    2.5 %   97.5 % 
## 0.652946 5.318086 
## [1] "P.val"
## [1] 0.3109151
## [1] "Point.est"
##     MRF1 
## 1.219512 
##     2.5 %    97.5 % 
## 0.5115039 2.7593941 
## [1] "P.val"
## [1] 0.6413772
## [1] "Point.est"
## `Tumor length`1 
##          0.6375 
##     2.5 %    97.5 % 
## 0.2784818 1.5329310 
## [1] "P.val"
## [1] 0.2964886
## [1] "Point.est"
## `Tumor thickness` 
##          1.035864 
##     2.5 %    97.5 % 
## 0.9803204 1.0907868 
## [1] "P.val"
## [1] 0.1843168
## [1] "Point.est"
## Distance1 
## 0.6446886 
##     2.5 %    97.5 % 
## 0.2943432 1.4160067 
## [1] "P.val"
## [1] 0.2699332
## [1] "Point.est"
##     CEA1 
## 1.088776 
##     2.5 %    97.5 % 
## 0.4701839 2.4182741 
## [1] "P.val"
## [1] 0.8373333
## [1] "Point.est"
## Differentiation1 
##         1.419069 
##     2.5 %    97.5 % 
## 0.6440263 3.2673379 
## [1] "P.val"
## [1] 0.3941766
## [1] "Point.est"
## radscore 
## 3.052979 
##    2.5 %   97.5 % 
## 2.139603 4.840260 
## [1] "P.val"
## [1] 6.061884e-08

multi-variate